Load data

cafe = read_csv(here::here("data/Sidewalk_Caf__Licenses_and_Applications_clean.csv"))
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   zip = col_double(),
##   swc_sq_ft = col_double(),
##   swc_tables = col_double(),
##   swc_chairs = col_double(),
##   lat = col_double(),
##   long = col_double(),
##   community_district = col_double(),
##   city_council_district = col_double(),
##   app_sq_ft = col_double(),
##   app_tables = col_double(),
##   app_chairs = col_double(),
##   app_status_date = col_datetime(format = ""),
##   expiration_date = col_datetime(format = ""),
##   app_too_date = col_datetime(format = ""),
##   submit_date = col_datetime(format = ""),
##   intake_dd = col_datetime(format = ""),
##   send_package_dd = col_datetime(format = ""),
##   cp_dd = col_datetime(format = ""),
##   cb_dd = col_datetime(format = ""),
##   hearing_dd = col_datetime(format = "")
##   # ... with 4 more columns
## )
## See spec(...) for full column specifications.
parking = read_csv(
  here::here("data/parking_vio2021_cleanv1.csv")) %>% 
  filter(hour != 12.3) 
## Parsed with column specification:
## cols(
##   id = col_double(),
##   summons_number = col_double(),
##   registration_state = col_character(),
##   issue_date = col_datetime(format = ""),
##   violation_code = col_double(),
##   vehicle_make = col_character(),
##   hour = col_double(),
##   min = col_character(),
##   violation_county = col_character(),
##   house_number = col_character(),
##   street_name = col_character(),
##   intersecting_street = col_character(),
##   vehicle_color = col_character(),
##   vehicle_year = col_double(),
##   address = col_character(),
##   long = col_double(),
##   lat = col_double(),
##   borough = col_character(),
##   fine_amount = col_double()
## )

Cafe Map

plot_cafe_map =
  parking %>%
  count(street_name, name = "ticket") %>%
  right_join(cafe) %>%
  mutate(ticket = replace_na(ticket,1e-10))
## Joining, by = "street_name"
pal = colorNumeric(palette = c("viridis", "magma", "inferno","plasma")[[4]],
                   domain = plot_cafe_map$ticket %>% log())

plot_cafe_map =
  plot_cafe_map%>%
  mutate(pop =
           str_c("<b>",business_name,"</b><br>",round(ticket)," tickets")) %>% 
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    ~ long,
    ~ lat,
    color = ~pal(ticket %>% log()),
    radius = .1,
    popup = ~ (pop)
  )

plot_cafe_map

##Geo vs ticket count

parking %>% 
  group_by(borough, lat, long) %>% 
  summarize(tickets_count = n()) %>% 
  mutate(text_label = str_c("Tickets count:", tickets_count, "Borough:", borough)) %>% 
  plot_ly(x = ~lat, y = ~long, z = ~tickets_count, text = ~ text_label) %>%
  add_markers(color = ~tickets_count,
              alpha = 0.0001) 
## `summarise()` regrouping output by 'borough', 'lat' (override with `.groups` argument)
## Warning: Ignoring 1 observations
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
parking %>%
  dplyr::select(long, lat, hour) %>%
  mutate(long = abs(long)) %>%
  drop_na() %>%
  with(., MASS::kde2d(lat, long)) %>%
  as_tibble() %>%
  plot_ly(x = ~ x, y = ~ y, z = ~ z) %>%
  add_surface() %>%
  layout(scene = list(
    yaixs = list(autorange = "reversed"),
    zaxis = list(
      range = c(1, 150),
      title = "",
      showline = FALSE,
      showticklabels = FALSE,
      showline = FALSE,
      showgrid = FALSE
    ),
    camera = list(eye = list(
      x = -1.25, y = 1.25, z = 1.25
    )),
    showlegend = FALSE
  ))
parking %>%
  ggplot(aes(x = long, y = lat)) +
  geom_hex() +
  theme(
    legend.position = "right") +
  labs(
    title = "tickets counts show in georaphic information") 
## Warning: Removed 246 rows containing non-finite values (stat_binhex).

Violation vs Time

1.The distribution of tickets quantities over year

parking %>% 
  mutate(month = lubridate::month(issue_date),
         month = forcats::fct_reorder(as.factor(month),month)
         ) %>% 
  drop_na(month,borough) %>% 
  group_by(borough,month) %>% 
  summarize(tickets = n()) %>% 
  ungroup() %>%
  mutate(borough = forcats::fct_reorder(borough,tickets,sum)) %>% 
  ggplot(aes(x = month, y = tickets, fill = borough,group = borough)) + 
    geom_bar(stat = "identity",position = "dodge") +
    labs(
      x = 'Issuing Month',
      y = 'counts',
      title = 'The distribution of tickets quantities over month ')
## `summarise()` regrouping output by 'borough' (override with `.groups` argument)

Conclusion: This dataset has nearly 39 million observations, most of which are recorded during June, July, August, and September.

2.plots: violation vs weekday

parking_day = parking %>%
  dplyr::select(issue_date, summons_number, borough) %>% 
  drop_na(borough)

day_order = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

parking_day %>% 
  mutate(day_week = weekdays(issue_date)) %>%
  group_by(day_week, borough) %>% 
  summarize(n = n()) %>%
  mutate(day_week = factor(day_week, levels = day_order)) %>% 
  arrange(day_week) %>% 
  plot_ly(x = ~ day_week, y = ~ n, type = 'scatter', mode = "line", color = ~borough) %>%
  layout(
    title = 'Violations by weekday',
    xaxis = list(
      type = 'category',
      title = 'Weekday'),
    yaxis = list(
      title = 'Count of violations'))
## `summarise()` regrouping output by 'day_week' (override with `.groups` argument)
## Warning: Ignoring 3 observations

Conclusion: The volume of parking tickets issuing during weekdays are larger than volume of those issuing during weekends. This could be attibuted to two reasons:
1. the overall demand for parking in NYC are smaller during weekend than during weekdays 2. Many parking spots are designed to be free parking during weekends.

All the five boroughs in NYC–Manhattan, Bronx, Brooklyn, Queens and Staten Island have the same trend in the term of parking tickets generating. Manhattan always has the highest parking tickets volume, and follows with Brooklyn, Queens, Bronx and Staten Island.

3.vilation vs hour

parking %>% 
  group_by(hour) %>% 
  summarize(n = n()) %>%
  plot_ly(x = ~hour, y = ~n, type = 'scatter',mode = 'line') %>%
  layout(
    title = 'Violations per Hour',
    xaxis = list(
      type = 'category',
      title = 'Hour',
      range = c(0, 23)),
    yaxis = list(
      title = 'Count of violations'))
## `summarise()` ungrouping output (override with `.groups` argument)
parking %>% 
  filter(hour < 23) %>% 
  drop_na(borough) %>% 
  ggplot(aes(x = hour, fill = borough)) +
  geom_bar(stat = "count") +
  labs(title = 'Violations per Hour for each borough') +
  ylab('Count of tickets') +
  scale_x_continuous(breaks=seq(0, 23, 1))

During a day, the parking tickets generating has two peaks– around 8:00 am and aroung 1:00 pm. This may caused by the high parking demand especially during meal time.

vio_code = readxl::read_xlsx('data/ParkingViolationCodes_January2020.xlsx') %>% 
  janitor::clean_names() %>% 
  select(violation_code, violation_description) %>% 
  mutate(violation_description = str_to_lower(violation_description))

vio_count_boro = 
  parking %>% 
  left_join(vio_code, by = 'violation_code')


vio_count_boro %>% 
  group_by(hour,borough) %>% 
  summarise(n = n()) %>% 
  plot_ly(x = ~hour, y = ~n, type = 'scatter',mode = 'line', color= ~ borough) %>%
  layout(
    title = 'Violations per Hour',
    xaxis = list(
      type = 'category',
      title = 'Hour',
      range = c(0, 23)),
    yaxis = list(
      title = 'Count of violations'))
## `summarise()` regrouping output by 'hour' (override with `.groups` argument)

##Overall violation type

violation_count = 
  parking %>% 
  group_by(violation_code) %>% 
  summarise(n = n()) %>% 
  left_join(vio_code, by = 'violation_code') %>% 
  arrange(desc(n)) %>% 
  filter(n > 10000)
## `summarise()` ungrouping output (override with `.groups` argument)

bar_plot:

violation_count %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) 

The top ten

##violation type by Borough

Manhattan_vio = 
  vio_count_boro %>% 
  filter(borough == 'Manhattan') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Manhattan') +
    xlab("violation description") +
    ylab('Count of tickets')  
## `summarise()` ungrouping output (override with `.groups` argument)
Queens_vio = 
  vio_count_boro %>% 
  filter(borough == 'Queens') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Queens') +
    xlab("violation description") +
    ylab('Count of tickets') 
## `summarise()` ungrouping output (override with `.groups` argument)
Bronx_vio = 
  vio_count_boro %>% 
  filter(borough == 'Bronx') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Bronx') +
    xlab("violation description") +
    ylab('Count of tickets')  
## `summarise()` ungrouping output (override with `.groups` argument)
Brooklyn_vio = 
  vio_count_boro %>% 
  filter(borough == 'Brooklyn') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Brooklyn') +
    xlab("violation description") +
    ylab('Count of tickets') 
## `summarise()` ungrouping output (override with `.groups` argument)
Staten_vio = 
  vio_count_boro %>% 
  filter(borough == 'Staten Island') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Staten Island') +
    xlab("violation description") +
    ylab('Count of tickets')  
## `summarise()` ungrouping output (override with `.groups` argument)
Manhattan_vio

Queens_vio 

Bronx_vio

Brooklyn_vio

Staten_vio

Anything to do with veihicle_make?

parking %>% 
  select(id, vehicle_make) %>% 
  group_by(vehicle_make) %>% 
  summarise(n = n()) %>% 
  filter(n>10) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  mutate(vehicle_make = fct_reorder(vehicle_make, n)) %>% 
  plot_ly(x = ~vehicle_make, y = ~n, type = "bar", colors = "viridis")
## `summarise()` ungrouping output (override with `.groups` argument)

According to the distribution of vehicle make of the parking ticket dataset, we can roughly know the tops 10 vehicle make that are most popular among new yorkers.

Some plot/description/test about fine amount

Generate data near by cafe

get_nearby_area =
  function(p_long = 0, p_lat = 0,area=100) {
    return(
      tibble(
        long_lwr = p_long - area / (6378137 * cos(pi * p_lat / 180)) * 180 / pi,
        long_upr = p_long + area / (6378137 * cos(pi * p_lat / 180)) * 180 /
          pi,
        lat_lwr = p_lat - area / 6378137 * 180 / pi,
        lat_upr = p_lat + area / 6378137 * 180 / pi
      )
    )
  }

get_data =
  function(area) {
    data =
      parking %>%
      bind_cols(area) %>%
      filter(long < long_upr,
             long > long_lwr,
             lat < lat_upr,
             lat > lat_lwr) %>% 
  select(summons_number, issue_date, violation_code,hour, vehicle_color, fine_amount)
    
    return(data)
  }

nearby_data = 
  cafe %>%
  select(business_name, business_name2, lat, long, borough)%>%
  mutate(
    nearby = map2(.x = long, .y = lat, ~ get_nearby_area(.x, .y)),
    nearby = map(.x = nearby,  ~ get_data(.x),
    ticket_n = map_dbl(nearby, nrow))) %>% 
  unnest(nearby)
  1. fine amount by boro
boro_amount = parking %>% 
  drop_na(fine_amount, borough)%>%
  select(borough, fine_amount)%>%
  group_by(borough) 

boro_amount %>%
  summarize(mean = mean(fine_amount)) %>% 
  knitr::kable()
## `summarise()` ungrouping output (override with `.groups` argument)
borough mean
Bronx 65.9
Brooklyn 62.4
Manhattan 68.0
Queens 58.8
Staten Island 66.0
boro_amount %>%
  ggplot(aes(x = borough, y = fine_amount, fill = borough)) +
  geom_boxplot() +
  ylim(0,200)+
  theme(
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) 
## Warning: Removed 2188 rows containing non-finite values (stat_boxplot).

  1. fine amount near cafe
nearby_data %>% 
  group_by(business_name) %>% 
  summarize(mean_fine = mean(fine_amount)) %>% 
  left_join(distinct(cafe), by = "business_name") %>% 
  drop_na(mean_fine) %>%
  mutate(pop =
           str_c("<b>",business_name,"</b><br>","Average fine amount:" ,round(mean_fine, digits = 2),"$")) %>% 
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    ~ long,
    ~ lat,
    color = ~pal(mean_fine %>% log()),
    radius = .1,
    popup = ~ (pop)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
  1. fine amount by weekday

fine amount vs weekday

day_order = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

parking %>%
  dplyr::select(issue_date, fine_amount, borough) %>% 
  drop_na(issue_date) %>% 
  mutate(day_week = weekdays(issue_date),
         day_week = factor(day_week, levels = day_order)) %>% 
  arrange(day_week) %>% 
  ggplot(aes(x = day_week, y = fine_amount, fill = day_week)) +
  geom_boxplot() +
  ylim(0,200) +
  theme(
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) 
## Warning: Removed 2984 rows containing non-finite values (stat_boxplot).

fine amount vs weekday faceted by borough

parking %>%
  dplyr::select(issue_date, fine_amount, borough) %>% 
  drop_na(issue_date, borough) %>% 
  mutate(day_week = weekdays(issue_date),
         day_week = factor(day_week, levels = day_order)) %>% 
  arrange(day_week) %>% 
  ggplot(aes(x = day_week, y = fine_amount, fill = day_week)) +
  geom_boxplot() +
  facet_grid(~borough) +
  ylim(0,200) +
  theme(
    legend.position = "right",
    legend.text = element_text(size = 8),
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1)
  ) 
## Warning: Removed 2738 rows containing non-finite values (stat_boxplot).

  1. Risk plot

Initial Design about how to study the correlation of tickets and cafe.

  • problem: we have location information(lat, long) for both tickets and cafe. but we cannot define their correlation because they are continuous

  • solution: – discrete the NYC district by euqally distribute it into smaller boxes with same area.
    – consider the cafe and tickets are correlated if their location fall into same box

  • some special case: – new york is not a square: we simply choose a square big enough to cover the whole new york. – if N tickets and M cafe are in the same box: distribute the correlation coefficient averagely.